package com.scrawler.tool;


/**
 * Static helper that decides whether a crawled URL should be accepted.
 *
 * <p>A URL is accepted when it contains none of the forbidden fragments,
 * has the shape {@code http://...<digits>.html} or
 * {@code http://...<digits>.shtml}, and is not reported as present by
 * {@link #bloom}.
 */
public class UrlFilter {
	/**
	 * Shared filter consulted by {@link #matchUrl(String)}.
	 * NOTE(review): presumably holds URLs that were already seen; this class
	 * never adds to it, so callers are expected to do that -- confirm.
	 */
	public static SimpleBloomFilter bloom = new SimpleBloomFilter();

	/** Fragments that disqualify a URL when they appear anywhere in it. */
	private static final String[] forbidWords = {"photo", "pic", "video", "/tv/",
			"/tupian/", "/games.", "pic."
			};

	/**
	 * First-level link rule: {@code http://}, anything, at least one digit,
	 * then a literal {@code .html} or {@code .shtml} at the very end.
	 * The dot is escaped so that it only matches a real '.' (an unescaped dot
	 * would also accept e.g. {@code 123xhtml}). Compiled once instead of on
	 * every call.
	 */
	private static final java.util.regex.Pattern ARTICLE_URL =
			java.util.regex.Pattern.compile("^http://.*\\d+\\.s?html$");

	/**
	 * Checks whether {@code url} passes all filtering rules.
	 *
	 * <p>This method only queries {@link #bloom}; it does not add the URL to it.
	 *
	 * @param url the candidate URL; {@code null} is rejected
	 * @return {@code true} if the URL contains no forbidden fragment, matches
	 *         the article URL pattern and {@code bloom.contains(url)} is
	 *         {@code false}; {@code false} otherwise
	 */
	public static boolean matchUrl(String url)
	{
		if (url == null)
		{
			return false;
		}
		for (String forbid : forbidWords)
		{
			if (url.contains(forbid))
			{
				return false;
			}
		}
		// First-level link matching rule.
		if (!ARTICLE_URL.matcher(url).matches())
		{
			return false;
		}
		// Filter out links the bloom filter reports as already present.
		if (bloom.contains(url))
		{
			return false;
		}
		return true;
	}

	/**
	 * Empty entry point; performs no work.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		// Intentionally empty.
	}
}
